1 Load data

We download the data from segerstolpe et al, hosted [here]

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(stringr)
  library(scRNAseq)
})
sce <- BaronPancreasData()
# We select the largest batch to avoid pre-processing issues
# sce <- sce[, sce$donor == "GSM2230759"]
filt <- rowSums(counts(sce) >= 2) >= 10
sce <- sce[filt, ]
sce
## class: SingleCellExperiment 
## dim: 12336 8569 
## metadata(0):
## assays(1): counts
## rownames(12336): A1CF A2M ... ZZZ3 pk
## rowData names(0):
## colnames(8569): human1_lib1.final_cell_0001 human1_lib1.final_cell_0002
##   ... human4_lib3.final_cell_0700 human4_lib3.final_cell_0701
## colData names(2): donor label
## reducedDimNames(0):
## altExpNames(0):
rm(filt)

2 Pre-processing

The dataset can be normalized using SCVI

suppressPackageStartupMessages({
  library(Seurat)
})
pre_process_time <- system.time({
  se <- CreateSeuratObject(counts = counts(sce),
                           min.cells = 0,
                           min.features = 0,
                           project = "de")
  se <- AddMetaData(se, as.data.frame(colData(sce)))
  se <- NormalizeData(se, verbose = FALSE)
  se <- FindVariableFeatures(se, selection.method = 'vst', nfeatures = 4000,
                             verbose = FALSE)
  se <- se[VariableFeatures(se), ]
  se <- ScaleData(object = se, vars.to.regress = c("nCount_RNA", "donor"))
  sce <- as.SingleCellExperiment(se)
})
suppressPackageStartupMessages({
  library(reticulate)
})
use_python("/usr/local/bin/python3")
scvi <- import("scvi")
anndata <- import("anndata")
np <- import("numpy")
sc <- import("scanpy")
scvi_time <- system.time({
  adata <- anndata$AnnData(X = as.sparse(t(counts(sce))),
                           obs = data.frame(cells = colnames(sce),
                                            batch = sce$donor))
  scvi$data$setup_anndata(adata, batch_key = "batch")
  model <- scvi$model$SCVI(adata)
  model$train(n_epochs = as.integer(60), n_epochs_kl_warmup = as.integer(30))
})

We can visualize the data using the labels from the original publication

library(scater)
reducedDim(sce, "scvi") <- model$get_latent_representation()
denoised <- t(model$get_normalized_expression(adata, library_size = 10e4))
dimnames(denoised) <- dimnames(counts(sce))
assay(sce, "denoised") <- log1p(denoised)
sce <- runTSNE(sce, dimred = "scvi")
plotTSNE(sce, colour_by = "label")

plotTSNE(sce, colour_by = "donor")

3 Creating inputs to Dune

3.1 sc3

suppressPackageStartupMessages(library(SC3))
sc3_time <- system.time({
  sce_sc3 <- sce
  logcounts(sce_sc3) <- assay(sce, "denoised")
  rowData(sce_sc3)$feature_symbol <- rownames(sce_sc3)
  counts(sce_sc3) <- as.matrix(counts(sce_sc3))
  logcounts(sce_sc3) <- as.matrix(logcounts(sce_sc3))
  sce_sc3 <- sc3_estimate_k(sce_sc3)
  K <- metadata(sce_sc3)$sc3$k_estimation
  # Note: with R >= 4.0, RStudio and Mac OS, this can fails. 
  # A workaround is running 
  # parallel:::setDefaultClusterOptions(setup_strategy = "sequential")
  sce_sc3 <- sc3(sce_sc3, ks = K, n_cores = NCORES, rand_seed = 786907,
                 svm_num_cells = round(.5 * ncol(sce)))
  sce_sc3 <- sc3_run_svm(sce_sc3, ks = K)    
  sce$SC3 <- colData(sce_sc3)[, paste0("sc3_", K, "_clusters")] %>% as.factor()
})
plotTSNE(sce, colour_by = "SC3")

3.2 Seurat

seurat_time <- system.time({
  se <- RunPCA(se, verbose = FALSE)
  se <- FindNeighbors(se, verbose = FALSE)
  se <- FindClusters(object = se, verbose = FALSE)
  sce$seurat <- Idents(se)
})
plotTSNE(sce, colour_by = "seurat")

3.3 Seurat with SCVI

seurat_scvi_time <- system.time({
  seu <- as.Seurat(x = sce, counts = "counts", data = "counts")
  seu <- FindNeighbors(seu, reduction = "scvi", verbose = FALSE)
  seu <- FindClusters(object = seu, verbose = FALSE)
  sce$seurat_scvi <- Idents(seu)
})
plotTSNE(sce, colour_by = "seurat_scvi")

4 Running Dune

library(Dune)
df <- colData(sce)[, c("SC3", "seurat", "seurat_scvi")] %>%
  as.matrix()
dune_time <- system.time(
  merger <- Dune(clusMat = df, BPPARAM = BPPARAM, metric = "NMI")
)
plotPrePost(merger)

NMItrend(merger)

plotNMIs(merger$initialMat)

plotNMIs(merger$currentMat)

5 Picking the final clustering result to use

5.1 Visual pick

sce$seurat_Final <- as.factor(merger$currentMat$seurat)
plotTSNE(sce, colour_by = "seurat_Final")

sce$seurat_scvi_Final <- as.factor(merger$currentMat$seurat_scvi)
plotTSNE(sce, colour_by = "seurat_scvi_Final")

sce$SC3_Final <- as.factor(merger$currentMat$SC3)
plotTSNE(sce, colour_by = "SC3_Final")

5.2 Picking based on sihouette

library(cluster)
dist_mat <- dist(as.matrix(reducedDim(sce, "scvi")))
sils <- lapply(merger$currentMat %>% as.data.frame, function(label){
  silhouette(label, dist = dist_mat)[,3] %>%  mean()
}) %>% unlist()
sils
##         SC3      seurat seurat_scvi 
## -0.03325991  0.18725471  0.31371226

6 Runtimes

times <- c(pre_process_time[1],
           scvi_time[1],
           sc3_time[1],
           seurat_time[1],
           seurat_scvi_time[1],
           dune_time[1])
names(times) <- c("Pre-Process",
                  "SCVI",
                  "Sc3",
                  "Seurat",
                  "Seurat after SCVI",
                  "Dune")
df <- data.frame(times = times,
                 Name = factor(names(times), levels = names(times)))
ggplot(df, aes(x = Name, y = times, fill = Name)) +
  geom_col() +
  theme_classic() +
  labs(x = "Step", y = "Time (second)") +
  scale_color_brewer(palette = "Dark2") +
  scale_y_log10()

LS0tCmF1dGhvcjogIkhlY3RvciBSb3V4IGRlIELDqXppZXV4IgpkYXRlOiAnYHIgZm9ybWF0KFN5cy50aW1lKCksICIlZCAlQiAsICVZIilgJwp0aXRsZTogJ0R1bmUgd29ya2Zsb3cgb24gdGhlIGJhcm9uIGRhdGFzZXQnCm91dHB1dDoKICBodG1sX2RvY3VtZW50OgogICAgdG9jOiB0cnVlCiAgICB0b2NfZmxvYXQ6IFRSVUUKICAgIHRvY19kZXB0aDogMgogICAgbnVtYmVyX3NlY3Rpb25zOiB0cnVlCiAgICBjb2RlX2Rvd25sb2FkOiBUUlVFCi0tLQoKYGBge3IgbG9hZCBwYWNrYWdlcywgaW5jbHVkZT1GfQpsaWJyYXJ5KGtuaXRyKQpvcHRzX2NodW5rJHNldCgKICBmaWcucG9zID0gIiFoIiwgb3V0LmV4dHJhID0gIiIsIHdhcm5pbmcgPSBGLCBtZXNzYWdlID0gRiwKICBmaWcuYWxpZ24gPSAiY2VudGVyIgopCk5DT1JFUyA8LSAyCmBgYAoKIyBMb2FkIGRhdGEKCldlIGRvd25sb2FkIHRoZSBkYXRhIGZyb20gc2VnZXJzdG9scGUgZXQgYWwsIGhvc3RlZCBbaGVyZV0KCmBgYHtyfQpzdXBwcmVzc1BhY2thZ2VTdGFydHVwTWVzc2FnZXMoewogIGxpYnJhcnkoU2luZ2xlQ2VsbEV4cGVyaW1lbnQpCiAgbGlicmFyeShzdHJpbmdyKQogIGxpYnJhcnkoc2NSTkFzZXEpCn0pCnNjZSA8LSBCYXJvblBhbmNyZWFzRGF0YSgpCiMgV2Ugc2VsZWN0IHRoZSBsYXJnZXN0IGJhdGNoIHRvIGF2b2lkIHByZS1wcm9jZXNzaW5nIGlzc3VlcwojIHNjZSA8LSBzY2VbLCBzY2UkZG9ub3IgPT0gIkdTTTIyMzA3NTkiXQpmaWx0IDwtIHJvd1N1bXMoY291bnRzKHNjZSkgPj0gMikgPj0gMTAKc2NlIDwtIHNjZVtmaWx0LCBdCnNjZQpybShmaWx0KQpgYGAKCiMgUHJlLXByb2Nlc3NpbmcKClRoZSBkYXRhc2V0IGNhbiBiZSBub3JtYWxpemVkIHVzaW5nIFNDVkkgCgpgYGB7cn0Kc3VwcHJlc3NQYWNrYWdlU3RhcnR1cE1lc3NhZ2VzKHsKICBsaWJyYXJ5KFNldXJhdCkKfSkKcHJlX3Byb2Nlc3NfdGltZSA8LSBzeXN0ZW0udGltZSh7CiAgc2UgPC0gQ3JlYXRlU2V1cmF0T2JqZWN0KGNvdW50cyA9IGNvdW50cyhzY2UpLAogICAgICAgICAgICAgICAgICAgICAgICAgICBtaW4uY2VsbHMgPSAwLAogICAgICAgICAgICAgICAgICAgICAgICAgICBtaW4uZmVhdHVyZXMgPSAwLAogICAgICAgICAgICAgICAgICAgICAgICAgICBwcm9qZWN0ID0gImRlIikKICBzZSA8LSBBZGRNZXRhRGF0YShzZSwgYXMuZGF0YS5mcmFtZShjb2xEYXRhKHNjZSkpKQogIHNlIDwtIE5vcm1hbGl6ZURhdGEoc2UsIHZlcmJvc2UgPSBGQUxTRSkKICBzZSA8LSBGaW5kVmFyaWFibGVGZWF0dXJlcyhzZSwgc2VsZWN0aW9uLm1ldGhvZCA9ICd2c3QnLCBuZmVhdHVyZXMgPSA0MDAwLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgIHZlcmJvc2UgPSBGQUxTRSkKICBzZSA8LSBzZVtWYXJpYWJsZUZlYXR1cmVzKHNlKSwgXQogIHNlIDwtIFNjYWxlRGF0YShvYmplY3QgPSBzZSwgdmFycy50by5yZWdyZXNzID0gYygibkNvdW50X1JOQSIsICJkb25vciIpKQogIHNjZSA8LSBhcy5TaW5nbGVDZWxsRXhwZXJpbWVudChzZSkKfSkKYGBgCgpgYGB7ciwgd2FybmluZz1GQUxTRX0Kc3VwcHJlc3NQYWNrYWdlU3RhcnR1cE1lc3NhZ2VzKHsKICBsaWJyYXJ5KHJldGljdWxhdGUpCn0pCnVzZV9weXRob24oIi91c3IvbG9jYWwvYmluL3B5dGhvbjMiKQpzY3ZpIDwtIGltcG9ydCgic2N2aSIpCmFubmRhdGEgPC0gaW1wb3J0KCJhbm5kYXRhIikKbnAgPC0gaW1wb3J0KCJudW1weSIpCnNjIDwtIGltcG9ydCgic2NhbnB5IikKc2N2aV90aW1lIDwtIHN5c3RlbS50aW1lKHsKICBhZGF0YSA8LSBhbm5kYXRhJEFubkRhdGEoWCA9IGFzLnNwYXJzZSh0KGNvdW50cyhzY2UpKSksCiAgICAgICAgICAgICAgICAgICAgICAgICAgIG9icyA9IGRhdGEuZnJhbWUoY2VsbHMgPSBjb2xuYW1lcyhzY2UpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIGJhdGNoID0gc2NlJGRvbm9yKSkKICBzY3ZpJGRhdGEkc2V0dXBfYW5uZGF0YShhZGF0YSwgYmF0Y2hfa2V5ID0gImJhdGNoIikKICBtb2RlbCA8LSBzY3ZpJG1vZGVsJFNDVkkoYWRhdGEpCiAgbW9kZWwkdHJhaW4obl9lcG9jaHMgPSBhcy5pbnRlZ2VyKDYwKSwgbl9lcG9jaHNfa2xfd2FybXVwID0gYXMuaW50ZWdlcigzMCkpCn0pCmBgYAoKV2UgY2FuIHZpc3VhbGl6ZSB0aGUgZGF0YSB1c2luZyB0aGUgbGFiZWxzIGZyb20gdGhlIG9yaWdpbmFsIHB1YmxpY2F0aW9uCgpgYGB7cn0KbGlicmFyeShzY2F0ZXIpCnJlZHVjZWREaW0oc2NlLCAic2N2aSIpIDwtIG1vZGVsJGdldF9sYXRlbnRfcmVwcmVzZW50YXRpb24oKQpkZW5vaXNlZCA8LSB0KG1vZGVsJGdldF9ub3JtYWxpemVkX2V4cHJlc3Npb24oYWRhdGEsIGxpYnJhcnlfc2l6ZSA9IDEwZTQpKQpkaW1uYW1lcyhkZW5vaXNlZCkgPC0gZGltbmFtZXMoY291bnRzKHNjZSkpCmFzc2F5KHNjZSwgImRlbm9pc2VkIikgPC0gbG9nMXAoZGVub2lzZWQpCnNjZSA8LSBydW5UU05FKHNjZSwgZGltcmVkID0gInNjdmkiKQpwbG90VFNORShzY2UsIGNvbG91cl9ieSA9ICJsYWJlbCIpCnBsb3RUU05FKHNjZSwgY29sb3VyX2J5ID0gImRvbm9yIikKYGBgCgojIENyZWF0aW5nIGlucHV0cyB0byBEdW5lCiMjIHNjMwoKYGBge3IsIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CnN1cHByZXNzUGFja2FnZVN0YXJ0dXBNZXNzYWdlcyhsaWJyYXJ5KFNDMykpCnNjM190aW1lIDwtIHN5c3RlbS50aW1lKHsKICBzY2Vfc2MzIDwtIHNjZQogIGxvZ2NvdW50cyhzY2Vfc2MzKSA8LSBhc3NheShzY2UsICJkZW5vaXNlZCIpCiAgcm93RGF0YShzY2Vfc2MzKSRmZWF0dXJlX3N5bWJvbCA8LSByb3duYW1lcyhzY2Vfc2MzKQogIGNvdW50cyhzY2Vfc2MzKSA8LSBhcy5tYXRyaXgoY291bnRzKHNjZV9zYzMpKQogIGxvZ2NvdW50cyhzY2Vfc2MzKSA8LSBhcy5tYXRyaXgobG9nY291bnRzKHNjZV9zYzMpKQogIHNjZV9zYzMgPC0gc2MzX2VzdGltYXRlX2soc2NlX3NjMykKICBLIDwtIG1ldGFkYXRhKHNjZV9zYzMpJHNjMyRrX2VzdGltYXRpb24KICAjIE5vdGU6IHdpdGggUiA+PSA0LjAsIFJTdHVkaW8gYW5kIE1hYyBPUywgdGhpcyBjYW4gZmFpbHMuIAogICMgQSB3b3JrYXJvdW5kIGlzIHJ1bm5pbmcgCiAgIyBwYXJhbGxlbDo6OnNldERlZmF1bHRDbHVzdGVyT3B0aW9ucyhzZXR1cF9zdHJhdGVneSA9ICJzZXF1ZW50aWFsIikKICBzY2Vfc2MzIDwtIHNjMyhzY2Vfc2MzLCBrcyA9IEssIG5fY29yZXMgPSBOQ09SRVMsIHJhbmRfc2VlZCA9IDc4NjkwNywKICAgICAgICAgICAgICAgICBzdm1fbnVtX2NlbGxzID0gcm91bmQoLjUgKiBuY29sKHNjZSkpKQogIHNjZV9zYzMgPC0gc2MzX3J1bl9zdm0oc2NlX3NjMywga3MgPSBLKSAgICAKICBzY2UkU0MzIDwtIGNvbERhdGEoc2NlX3NjMylbLCBwYXN0ZTAoInNjM18iLCBLLCAiX2NsdXN0ZXJzIildICU+JSBhcy5mYWN0b3IoKQp9KQpwbG90VFNORShzY2UsIGNvbG91cl9ieSA9ICJTQzMiKQpgYGAKCiMjIFNldXJhdCAKCmBgYHtyfQpzZXVyYXRfdGltZSA8LSBzeXN0ZW0udGltZSh7CiAgc2UgPC0gUnVuUENBKHNlLCB2ZXJib3NlID0gRkFMU0UpCiAgc2UgPC0gRmluZE5laWdoYm9ycyhzZSwgdmVyYm9zZSA9IEZBTFNFKQogIHNlIDwtIEZpbmRDbHVzdGVycyhvYmplY3QgPSBzZSwgdmVyYm9zZSA9IEZBTFNFKQogIHNjZSRzZXVyYXQgPC0gSWRlbnRzKHNlKQp9KQpwbG90VFNORShzY2UsIGNvbG91cl9ieSA9ICJzZXVyYXQiKQpgYGAKCiMjIFNldXJhdCB3aXRoIFNDVkkKCmBgYHtyfQpzZXVyYXRfc2N2aV90aW1lIDwtIHN5c3RlbS50aW1lKHsKICBzZXUgPC0gYXMuU2V1cmF0KHggPSBzY2UsIGNvdW50cyA9ICJjb3VudHMiLCBkYXRhID0gImNvdW50cyIpCiAgc2V1IDwtIEZpbmROZWlnaGJvcnMoc2V1LCByZWR1Y3Rpb24gPSAic2N2aSIsIHZlcmJvc2UgPSBGQUxTRSkKICBzZXUgPC0gRmluZENsdXN0ZXJzKG9iamVjdCA9IHNldSwgdmVyYm9zZSA9IEZBTFNFKQogIHNjZSRzZXVyYXRfc2N2aSA8LSBJZGVudHMoc2V1KQp9KQpwbG90VFNORShzY2UsIGNvbG91cl9ieSA9ICJzZXVyYXRfc2N2aSIpCmBgYAoKIyBSdW5uaW5nIER1bmUKCmBgYHtyfQpsaWJyYXJ5KER1bmUpCmRmIDwtIGNvbERhdGEoc2NlKVssIGMoIlNDMyIsICJzZXVyYXQiLCAic2V1cmF0X3NjdmkiKV0gJT4lCiAgYXMubWF0cml4KCkKZHVuZV90aW1lIDwtIHN5c3RlbS50aW1lKAogIG1lcmdlciA8LSBEdW5lKGNsdXNNYXQgPSBkZiwgQlBQQVJBTSA9IEJQUEFSQU0sIG1ldHJpYyA9ICJOTUkiKQopCmBgYAoKYGBge3J9CnBsb3RQcmVQb3N0KG1lcmdlcikKTk1JdHJlbmQobWVyZ2VyKQpgYGAKCmBgYHtyfQpwbG90Tk1JcyhtZXJnZXIkaW5pdGlhbE1hdCkKcGxvdE5NSXMobWVyZ2VyJGN1cnJlbnRNYXQpCmBgYAoKIyBQaWNraW5nIHRoZSBmaW5hbCBjbHVzdGVyaW5nIHJlc3VsdCB0byB1c2UKIyMgVmlzdWFsIHBpY2sKCmBgYHtyfQpzY2Ukc2V1cmF0X0ZpbmFsIDwtIGFzLmZhY3RvcihtZXJnZXIkY3VycmVudE1hdCRzZXVyYXQpCnBsb3RUU05FKHNjZSwgY29sb3VyX2J5ID0gInNldXJhdF9GaW5hbCIpCmBgYAoKYGBge3J9CnNjZSRzZXVyYXRfc2N2aV9GaW5hbCA8LSBhcy5mYWN0b3IobWVyZ2VyJGN1cnJlbnRNYXQkc2V1cmF0X3NjdmkpCnBsb3RUU05FKHNjZSwgY29sb3VyX2J5ID0gInNldXJhdF9zY3ZpX0ZpbmFsIikKYGBgCgpgYGB7cn0Kc2NlJFNDM19GaW5hbCA8LSBhcy5mYWN0b3IobWVyZ2VyJGN1cnJlbnRNYXQkU0MzKQpwbG90VFNORShzY2UsIGNvbG91cl9ieSA9ICJTQzNfRmluYWwiKQpgYGAKCiMjIFBpY2tpbmcgYmFzZWQgb24gc2lob3VldHRlCgpgYGB7cn0KbGlicmFyeShjbHVzdGVyKQpkaXN0X21hdCA8LSBkaXN0KGFzLm1hdHJpeChyZWR1Y2VkRGltKHNjZSwgInNjdmkiKSkpCnNpbHMgPC0gbGFwcGx5KG1lcmdlciRjdXJyZW50TWF0ICU+JSBhcy5kYXRhLmZyYW1lLCBmdW5jdGlvbihsYWJlbCl7CiAgc2lsaG91ZXR0ZShsYWJlbCwgZGlzdCA9IGRpc3RfbWF0KVssM10gJT4lICBtZWFuKCkKfSkgJT4lIHVubGlzdCgpCnNpbHMKYGBgCgojIFJ1bnRpbWVzCgpgYGB7cn0KdGltZXMgPC0gYyhwcmVfcHJvY2Vzc190aW1lWzFdLAogICAgICAgICAgIHNjdmlfdGltZVsxXSwKICAgICAgICAgICBzYzNfdGltZVsxXSwKICAgICAgICAgICBzZXVyYXRfdGltZVsxXSwKICAgICAgICAgICBzZXVyYXRfc2N2aV90aW1lWzFdLAogICAgICAgICAgIGR1bmVfdGltZVsxXSkKbmFtZXModGltZXMpIDwtIGMoIlByZS1Qcm9jZXNzIiwKICAgICAgICAgICAgICAgICAgIlNDVkkiLAogICAgICAgICAgICAgICAgICAiU2MzIiwKICAgICAgICAgICAgICAgICAgIlNldXJhdCIsCiAgICAgICAgICAgICAgICAgICJTZXVyYXQgYWZ0ZXIgU0NWSSIsCiAgICAgICAgICAgICAgICAgICJEdW5lIikKZGYgPC0gZGF0YS5mcmFtZSh0aW1lcyA9IHRpbWVzLAogICAgICAgICAgICAgICAgIE5hbWUgPSBmYWN0b3IobmFtZXModGltZXMpLCBsZXZlbHMgPSBuYW1lcyh0aW1lcykpKQpnZ3Bsb3QoZGYsIGFlcyh4ID0gTmFtZSwgeSA9IHRpbWVzLCBmaWxsID0gTmFtZSkpICsKICBnZW9tX2NvbCgpICsKICB0aGVtZV9jbGFzc2ljKCkgKwogIGxhYnMoeCA9ICJTdGVwIiwgeSA9ICJUaW1lIChzZWNvbmQpIikgKwogIHNjYWxlX2NvbG9yX2JyZXdlcihwYWxldHRlID0gIkRhcmsyIikgKwogIHNjYWxlX3lfbG9nMTAoKQpgYGA=